library(mice)
library(tidyverse)
# Load the two pre-cleaned data sets that are imputed below.
# NOTE(review): the file names suggest wave 1 (train) vs. waves 2-3 (test) — confirm.
train <- utils::read.csv(file = "../clean_data/mci_wv1go_aug.csv")
test <- utils::read.csv(file = "../clean_data/mci_wv23_aug.csv")

Visualization

The overall missingness patterns differ considerably between waves, so I will impute the train and test sets separately to avoid information leakage.

# Share of missing values per variable in each set, reshaped to long format
# (one row per variable/set pair) so both sets can be drawn side by side.
# assumes train and test hold the same columns in the same order — TODO confirm
df_bar <- pivot_longer(
  data.frame(
    variable = names(train),
    train = colMeans(is.na(train)),
    test = colMeans(is.na(test))
  ),
  cols = c("train", "test"),
  names_to = "set",
  values_to = "Missingness"
)

# Dodged bars per variable; reorder() sorts variables by decreasing mean
# missingness across the two sets.
ggplot(
  df_bar,
  aes(x = reorder(variable, desc(Missingness)), y = Missingness, fill = set)
) +
  geom_col(position = "dodge") +
  xlab("Feature") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

# Missing-data pattern tables, kept for reference but disabled:
# md.pattern(train, rotate.names = T)
# md.pattern(test, rotate.names = T)
# Influx/outflux plot (mice::fluxplot) of the missing-data pattern, drawn
# separately for the training set and the test set.
fluxplot(train)

fluxplot(test)

Imputation for training

# Build the predictor matrix with quickpred()'s default settings, then create
# five imputed versions of the training set (five iterations each).
pred <- quickpred(train)
imp <- mice(
  train,
  m = 5,
  maxit = 5,
  predictorMatrix = pred, # spelled out; `pred =` relied on partial matching
  seed = 1,               # fixed seed so the imputations are reproducible
  printFlag = FALSE,
  ridge = 0.001
)
## Warning: Number of logged events: 1816
# The default settings raised an error, probably caused by linear combinations
# among the predictors (according to the warning); a higher ridge is set to
# address that collinearity.
# imp$method    # all use pmm
# diagnostic:
bwplot(imp, layout = c(3, 1))

stripplot(imp, pch = c(21, 20), cex = c(1, 1.5), layout = c(3, 1))

# export:
# Write every completed training set to its own CSV file.
# complete() picks the imputation through `action`; it has no `m` argument
# (a stray `m =` is partially matched to `mild`, which leaves `action` at 1
# and therefore always yields the first imputation).
for (i in seq_len(imp$m)) {
  fl <- complete(imp, action = i)
  fl_name <- paste0("../clean_data/impute/train", i, ".csv")
  write.csv(fl, file = fl_name, row.names = FALSE)
}

Imputation for testing

# Build the predictor matrix with quickpred()'s default settings, then create
# five imputed versions of the test set (five iterations each). Only the test
# data are passed to quickpred() and mice() here.
pred <- quickpred(test)
imp <- mice(
  test,
  m = 5,
  maxit = 5,
  predictorMatrix = pred, # spelled out; `pred =` relied on partial matching
  seed = 1,               # fixed seed so the imputations are reproducible
  printFlag = FALSE,
  ridge = 0.001
)
## Warning: Number of logged events: 1431
# The default settings raised an error, probably caused by linear combinations
# among the predictors (according to the warning); a higher ridge is set to
# address that collinearity.
# imp$method    # all use pmm
# diagnostic:
bwplot(imp, layout = c(3, 1))

stripplot(imp, pch = c(21, 20), cex = c(1, 1.5), layout = c(3, 1))

# export:
# Write every completed test set to its own CSV file.
# complete() picks the imputation through `action`; it has no `m` argument
# (a stray `m =` is partially matched to `mild`, which leaves `action` at 1
# and therefore always yields the first imputation).
for (i in seq_len(imp$m)) {
  fl <- complete(imp, action = i)
  fl_name <- paste0("../clean_data/impute/test", i, ".csv")
  write.csv(fl, file = fl_name, row.names = FALSE)
}
# Sanity check: run a minimal imputation (one data set, one iteration) on the
# training data and confirm that the completed data contain no NA.
# NOTE: this reassigns `pred` and `imp`, replacing the objects created above.
pred <- quickpred(train)
imp <- mice(
  train,
  m = 1,
  maxit = 1,
  predictorMatrix = pred,
  seed = 1,
  printFlag = FALSE,
  ridge = 0.001
)
## Warning: Number of logged events: 66
# Only one imputation exists, so request it explicitly through `action`.
check_na <- complete(imp, action = 1L)
sum(is.na(check_na))
## [1] 0
colSums(is.na(check_na))
##                  AV45           EcogSPOrgan          EcogSPDivatt 
##                     0                     0                     0 
##         EcogSPVisspat            EcogSPPlan           EcogPtOrgan 
##                     0                     0                     0 
##           EcogSPTotal             EcogSPMem                  MOCA 
##                     0                     0                     0 
##            EcogSPLang          EcogPtDivatt         EcogPtVisspat 
##                     0                     0                     0 
##            EcogPtLang             EcogPtMem           EcogPtTotal 
##                     0                     0                     0 
##            EcogPtPlan             TAU_UPENN            PTAU_UPENN 
##                     0                     0                     0 
##           ABETA_UPENN                   FDG            Entorhinal 
##                     0                     0                     0 
##              Fusiform               MidTemp           Hippocampus 
##                     0                     0                     0 
##            Ventricles            WholeBrain              IMAGEUID 
##                     0                     0                     0 
##                   ICV              TRABSCOR                   FAQ 
##                     0                     0                     0 
##                 CDRSB                ADAS13 RAVLT_perc_forgetting 
##                     0                     0                     0 
##      RAVLT_forgetting              PTETHCAT                 APOE4 
##                     0                     0                     0 
##               PTMARRY             LDELTOTAL       RAVLT_immediate 
##                     0                     0                     0 
##                  MMSE        RAVLT_learning                ADAS11 
##                     0                     0                     0 
##              PTRACCAT          mPACCtrailsB            mPACCdigit 
##                     0                     0                     0 
##                ADASQ4              PTEDUCAT              PTGENDER 
##                     0                     0                     0 
##                   AGE               VISCODE                  SITE 
##                     0                     0                     0 
##              progress 
##                     0